The data consist of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions across multiple temporal windows.
Set user defined parameters
# Echo the knit-time parameters (`params`) so this run's configuration is
# recorded in the rendered document below.
print(params)
## $autoKfold
## [1] FALSE
##
## $run
## [1] TRUE
##
## $save_figs
## [1] FALSE
##
## $trimAnomalies
## [1] TRUE
##
## $ecoregion
## [1] "trees"
##
## $response
## [1] "ForbCover_prop"
##
## $removeTexasLouisianaPlain
## [1] FALSE
##
## $removeAllAnoms
## [1] FALSE
##
## $whichSecondBestMod
## [1] "auto"
# Unpack the knit parameters into short local names used throughout the script.
# set to true if want to run for a limited number of rows (i.e. for code testing)
# NOTE(review): `test_run` does not appear in the printed params list above, so
# this is likely NULL -- confirm it is declared in the Rmd YAML header.
test_run <- params$test_run
save_figs <- params$save_figs
response <- params$response
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
trimAnom <- params$trimAnomalies
removeTLP <- params$removeTexasLouisianaPlain
run <- params$run # if FALSE, previously saved model objects are reloaded instead of refitting
autoKfold <- params$autoKfold
removeAllAnoms <- params$removeAllAnoms
whichSecondBestMod <- params$whichSecondBestMod
Load packages and source functions
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
# NOTE(review): the comment above refers to setting an option, but no option is
# actually set in this chunk -- confirm whether an options() call was lost.
# Project helper functions (transformations, quasi-Beta family, LASSO helpers)
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/betaLASSO.R")
#source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
library(betareg)
library(ggspatial)
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(patchwork) # for figure insets etc.
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
library(here)
library(rsample)
library(kableExtra)
library(glmnet)
library(USA.state.boundaries)
Data compiled in the prepDataForModels.R script
# Anchor relative paths at the project root so the data paths below resolve.
here::i_am("Analysis/VegComposition/ModelFitting/03_modelFitting_testingBetaLASSO.Rmd")
# Load the pre-compiled cover + climate dataset (built in prepDataForModels.R)
# and drop the sf geometry column -- only the tabular data are needed here.
modDat <- readRDS( here("Data_processed", "CoverData", "CoverModelData_withTreeNoTreePreds.rds")) %>% st_drop_geometry()

# Cover/response columns that must lie strictly inside (0, 1) for the Beta
# model framework. The Level 1 cover variables (first four) arrive on a
# 0-100 scale; the *_prop columns are already proportions.
level1Vars <- c("TotalTreeCover", "TotalHerbaceousCover", "BareGroundCover", "ShrubCover")
responseVars <- c(level1Vars,
                  "AngioTreeCover_prop", "ConifTreeCover_prop",
                  "C4GramCover_prop", "C3GramCover_prop", "ForbCover_prop")

# change Level 1 cover variables to be between 0 and 1 rather than 0 and 100
for (v in level1Vars) {
  modDat[[v]] <- modDat[[v]] / 100
}

# The Beta likelihood is undefined at exactly 0 or 1, so nudge boundary values
# just inside the open interval (0 -> 0.0001, 1 -> 0.9999), leaving NAs alone.
# (Replaces 18 near-identical copy-paste assignments with one loop.)
for (v in responseVars) {
  isZero <- modDat[[v]] == 0 & !is.na(modDat[[v]])
  isOne <- modDat[[v]] == 1 & !is.na(modDat[[v]])
  modDat[isZero, v] <- 0.0001
  modDat[isOne, v] <- 0.9999
}
# Ecoregion subset and response variable for this run (from knit params)
ecoregion <- params$ecoregion
response <- params$response
print(paste0("In this model run, the ecoregion is ", ecoregion," and the response variable is ",response))
## [1] "In this model run, the ecoregion is trees and the response variable is ForbCover_prop"
The following are the candidate predictor variables for this ecoregion:
# Choose the candidate predictor set for the requested ecoregion. Each set
# mixes long-term climate normals, soil properties, and weather-anomaly terms.
# NOTE(review): "precp_dry_anom" matches the column spelling in the data
# (see the summary table below), so it is intentionally kept as-is.
if (ecoregion == "noTrees") {
# select potential predictor variables for the ecoregion of interest
prednames <-
c(
"tmean" , "prcp" , "prcp_dry" , "prcp_seasonality" ,"prcpTempCorr" ,"isothermality" ,
"clay" , "coarse" , "carbon" , "AWHC" ,"tmin_anom" ,"tmax_anom" ,
"t_warm_anom" , "prcp_wet_anom" , "precp_dry_anom", "prcp_seasonality_anom" ,"prcpTempCorr_anom","isothermality_anom",
"annWatDef_anom", "annWetDegDays_anom", "VPD_min_anom" , "frostFreeDays_anom")
} else if (ecoregion == "trees") {
# select potential predictor variables for the ecoregion of interest
prednames <-
c(
"tmean" , "prcp" , "prcp_dry" , "prcpTempCorr" ,"isothermality" , "annWatDef" ,
"sand" , "coarse" , "carbon" , "AWHC" ,"tmin_anom" , "tmax_anom" ,
"t_warm_anom" , "prcp_wet_anom" , "precp_dry_anom" , "prcp_seasonality_anom" ,"prcpTempCorr_anom", "aboveFreezingMonth_anom",
"isothermality_anom", "annWatDef_anom", "annWetDegDays_anom", "VPD_min_anom"
)
} else if (ecoregion == "CONUS") {
prednames <- c(
"tmean" , "prcp" , "prcp_seasonality" , "prcpTempCorr" , "isothermality" ,"annWetDegDays" ,
"sand" , "coarse" , "AWHC" , "tmin_anom" , "tmax_anom" ,"prcp_anom",
"t_warm_anom" , "precp_dry_anom" , "prcp_seasonality_anom", "prcpTempCorr_anom", "aboveFreezingMonth_anom", "isothermality_anom",
"annWatDef_anom", "annWetDegDays_anom", "VPD_min_anom"
)
}
modDat_1 <- modDat
# All candidate predictor columns (climate normals, anomalies, soil ranges)
allPreds <- modDat_1 %>%
dplyr::select(tmin:frostFreeDays,tmean_anom:frostFreeDays_anom, soilDepth:AWHC) %>%
names()
# Center/scale every predictor into a new "<name>_s" column.
# NOTE(review): base::scale() returns a 1-column matrix, so the *_s columns
# become matrix-columns, and passing a bare function (rather than a lambda)
# through across() is deprecated in recent dplyr -- confirm downstream code
# tolerates both.
modDat_1_s <- modDat_1 %>%
mutate(across(all_of(allPreds), base::scale, .names = "{.col}_s"))
saveRDS(modDat_1_s, file = "./models/scaledModelInputData.rds")
# Subset rows to the ecoregion of interest, using the tree-probability
# classification produced by an earlier (tree / no-tree) model.
if (ecoregion == "noTrees") {
# select data for the ecoregion of interest
modDat_1_s <- modDat_1_s %>%
filter(treeProb_bestLambda_binary == 0)
} else if (ecoregion == "trees") {
# select data for the ecoregion of interest
modDat_1_s <- modDat_1_s %>%
filter(treeProb_bestLambda_binary == 1)
} else if (ecoregion == "CONUS") {
modDat_1_s <- modDat_1_s
}
# remove the rows that have no observations for the response variable of interest
modDat_1_s <- modDat_1_s[!is.na(modDat_1_s[,response]),]
# subset the data to only include these predictors, and remove any remaining NAs
# NOTE(review): selecting with bare external vectors (`prednames`, `response`)
# is deprecated tidyselect behavior -- wrap in all_of() when next touched.
modDat_1_s <- modDat_1_s %>%
dplyr::select(prednames, paste0(prednames, "_s"), response, newRegion, Year, x, y, NA_L1NAME, NA_L2NAME) %>%
drop_na()
# name the predictor vector with itself so tidyselect/pivot helpers that use
# names(prednames) work below
names(prednames) <- prednames
df_pred <- modDat_1_s[, prednames]
# distribution of the (proportion-scale) response
hist(modDat_1_s[,response], main = paste0("Histogram of ",response),
xlab = paste0(response))
# Summarise every column of a data frame: pivots long, then computes the mean,
# min, median, and max of each variable (NAs removed), rounding all numeric
# results to 4 decimal places. Returns a tibble with one row per input column.
create_summary <- function(df) {
df %>%
pivot_longer(cols = everything(),
names_to = 'variable') %>%
group_by(variable) %>%
summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>%
# use a lambda rather than passing `round, 4` through `...`,
# which is deprecated in dplyr >= 1.1
mutate(across(where(is.numeric), ~ round(.x, 4)))
}
# Render summary statistics for the candidate predictors as an HTML table
modDat_1_s[prednames] %>%
create_summary() %>%
knitr::kable(caption = 'summaries of possible predictor variables') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| variable | value_mean | value_min | value_median | value_max |
|---|---|---|---|---|
| AWHC | 12.1813 | 0.9085 | 11.5255 | 34.6680 |
| VPD_min_anom | -0.0093 | -0.1953 | -0.0106 | 0.1008 |
| aboveFreezingMonth_anom | 0.1329 | -1.8900 | 0.0726 | 2.7016 |
| annWatDef | 49.4091 | 0.0000 | 38.5912 | 273.9773 |
| annWatDef_anom | -0.0382 | -6.7485 | -0.0529 | 1.0000 |
| annWetDegDays_anom | -0.0415 | -1.2671 | -0.0371 | 0.7184 |
| carbon | 8.3074 | 0.1947 | 4.9236 | 48.1381 |
| coarse | 19.9895 | 0.0000 | 19.0080 | 64.1122 |
| isothermality | 36.7083 | 22.7143 | 36.5775 | 59.9360 |
| isothermality_anom | 1.1730 | -8.0979 | 1.0958 | 9.2460 |
| prcp | 1048.6996 | 187.1626 | 889.4758 | 4360.3490 |
| prcpTempCorr | -0.3303 | -0.8510 | -0.3475 | 0.6975 |
| prcpTempCorr_anom | -0.0037 | -0.5888 | 0.0032 | 0.5459 |
| prcp_dry | 11.5305 | 0.0000 | 6.7477 | 66.6148 |
| prcp_seasonality_anom | 0.0157 | -0.6081 | 0.0206 | 0.3656 |
| prcp_wet_anom | 0.0009 | -0.8031 | 0.0080 | 0.6248 |
| precp_dry_anom | -0.0122 | -6.7500 | 0.0622 | 1.0000 |
| sand | 46.2068 | 1.3984 | 45.4349 | 99.9418 |
| t_warm_anom | -0.3246 | -5.5689 | -0.3307 | 3.3810 |
| tmax_anom | -0.2364 | -5.1921 | -0.2411 | 3.1773 |
| tmean | 7.9898 | -2.0495 | 7.3397 | 22.5542 |
| tmin_anom | -0.7134 | -4.1413 | -0.6817 | 2.9385 |
# Long-format table of the unscaled predictor values (one row per obs x predictor).
# all_of() replaces deprecated bare-external-vector selection.
scaleFigDat_1 <- modDat_1_s %>%
dplyr::select(c(x, y, Year, all_of(prednames))) %>%
pivot_longer(cols = all_of(names(prednames)),
names_to = "predNames",
values_to = "predValues_unScaled")
# Matching long-format table of the scaled ("_s") values; the "_s" suffix is
# stripped so the two tables share predictor names. The original call carried
# a dangling empty `names_sep = ` argument, removed here.
scaleFigDat_2 <- modDat_1_s %>%
dplyr::select(c(x, y, Year, all_of(paste0(prednames, "_s")))) %>%
pivot_longer(cols = all_of(paste0(prednames, "_s")),
names_to = "predNames",
values_to = "predValues_scaled") %>%
mutate(predNames = str_replace(predNames, pattern = "_s$", replacement = ""))
# Join scaled to unscaled values (explicit keys silence the join message;
# these are exactly the common columns the implicit join used).
scaleFigDat_3 <- scaleFigDat_1 %>%
left_join(scaleFigDat_2, by = c("x", "y", "Year", "predNames"))
# Overlay histograms of unscaled (grey) vs scaled (blue) values per predictor
ggplot(scaleFigDat_3) +
facet_wrap(~predNames, scales = "free") +
geom_histogram(aes(predValues_unScaled), fill = "lightgrey", col = "darkgrey") +
geom_histogram(aes(predValues_scaled), fill = "lightblue", col = "blue") +
xlab ("predictor variable values") +
ggtitle("Comparing the distribution of unscaled (grey) to scaled (blue) predictor variables")
# Swap naming so the scaled columns take the plain predictor names:
# "<name>" (unscaled) -> "<name>_raw", then "<name>_s" -> "<name>".
# All downstream modeling therefore uses the scaled versions under the
# original predictor names.
modDat_1_s <- modDat_1_s %>%
rename_with(~paste0(.x, "_raw"),
any_of(names(prednames))) %>%
rename_with(~str_remove(.x, "_s$"),
any_of(paste0(names(prednames), "_s")))
set.seed(12011993)
# vector of name of response variables
vars_response <- response
# Long-format sample for boxplots: bin the response into quartile classes,
# then pivot the predictors long so each gets a facet.
df_sample_plots <- modDat_1_s %>%
slice_sample(n = 5e4) %>%
rename(response = all_of(response)) %>%
mutate(response = case_when(
response <= .25 ~ ".25",
response > .25 & response <=.50 ~ ".50",
response > .50 & response <=.75 ~ ".75",
response >= .75 ~ "1",
)) %>%
dplyr::select(c(response, all_of(prednames))) %>%
tidyr::pivot_longer(cols = all_of(unname(prednames)),
names_to = "predictor",
values_to = "value"
)
# aes_string() is deprecated in ggplot2; the binned response column is
# literally named "response", so plain aes() is equivalent.
ggplot(df_sample_plots, aes(x = response, y = value)) +
geom_boxplot() +
facet_wrap(~predictor , scales = 'free_y') +
ylab("Predictor Variable Values") +
xlab(response)
First, if there are observations in Louisiana, sub-sample them so they’re not so over-represented in the dataset
## make data into spatial format
# NOTE(review): x/y are assumed to be longitude/latitude (EPSG:4326) --
# confirm against how they were written in prepDataForModels.R.
modDat_1_sf <- modDat_1_s %>%
st_as_sf(coords = c("x", "y"), crs = st_crs("EPSG:4326"))
# download map info for visualization
data(state_boundaries_wgs84)
# keep only CONUS states, then reproject to match the point data's CRS
cropped_states <- suppressMessages(state_boundaries_wgs84 %>%
dplyr::filter(NAME!="Hawaii") %>%
dplyr::filter(NAME!="Alaska") %>%
dplyr::filter(NAME!="Puerto Rico") %>%
dplyr::filter(NAME!="American Samoa") %>%
dplyr::filter(NAME!="Guam") %>%
dplyr::filter(NAME!="Commonwealth of the Northern Mariana Islands") %>%
dplyr::filter(NAME!="United States Virgin Islands") %>%
sf::st_sf() %>%
sf::st_transform(sf::st_crs(modDat_1_sf))) #%>%
#sf::st_crop(sf::st_bbox(modDat_1_sf)+c(-1,-1,1,1))
# Thin oversampled coastal-Louisiana observations. This branch only runs for
# forest-named ecoregion settings, not the "trees"/"noTrees"/"CONUS" values
# used by the prednames logic above -- so it is inert in this run.
if (ecoregion %in% c("Forest", "eastForest", "forest")){
modDat_1_s$uniqueID <- 1:nrow(modDat_1_s)
modDat_1_sf$uniqueID <- 1:nrow(modDat_1_sf)
# find which observations overlap with Louisiana
obs_LA_temp <- st_intersects(modDat_1_sf, cropped_states[cropped_states$NAME == "Louisiana",], sparse = FALSE)
if (sum(obs_LA_temp) > 0) {
obs_LA_1 <- modDat_1_sf[which(obs_LA_temp == TRUE, arr.ind = TRUE)[,1],]
# now, find only those within the oversampled area (near the coast)
# NOTE(review): these bbox coordinates look like projected meters, but the
# points are in EPSG:4326 (degrees); st_set_crs() only relabels the CRS and
# does not transform, so this box may never intersect anything -- verify.
dims <- c(xmin = 730439.1, xmax = 1042195.5, ymax = -1222745.2, ymin = -1390430.9)
badBox <- st_bbox(dims) %>%
st_as_sfc() %>%
st_set_crs(value = st_crs(modDat_1_sf))
obs_LA_temp2 <- st_intersects(obs_LA_1, badBox, sparse = FALSE)
obs_LA_2 <- obs_LA_1[which(obs_LA_temp2 == TRUE, arr.ind = TRUE)[,1],]
# subsample so there aren't so many
# keep every 10th observation (an earlier comment said "every 7th"; the code uses by = 10)
obs_LA_sampled <- obs_LA_2[seq(from = 1, to = nrow(obs_LA_2), by = 10),]
# remove observations that aren't sampled from the larger dataset
obsToRemove <- obs_LA_2[!(obs_LA_2$uniqueID %in% obs_LA_sampled$uniqueID),]
modDat_1_sf <- modDat_1_sf[!(modDat_1_sf$uniqueID %in% obsToRemove$uniqueID),]
modDat_1_s <- modDat_1_s[!(modDat_1_s$uniqueID %in% obsToRemove$uniqueID),]
}
}
## do a pca of climate across level 2 ecoregions
pca <- prcomp(modDat_1_s[,paste0(prednames)])
library(factoextra)
# PCA biplot of individuals, colored by Level 2 ecoregion, with 95% ellipses
(fviz_pca_ind(pca, habillage = modDat_1_s$NA_L2NAME, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))
# for ecoregions that are very tiny/extremely different from the rest of the other ecoregions climatically, aggregate them together
# (applied to both the tabular and spatial copies so they stay in sync)
if(ecoregion == "noTrees") {
modDat_1_s[modDat_1_s$NA_L2NAME %in% c("EVERGLADES", "SOUTHEASTERN USA PLAINS", "TEXAS-LOUISIANA COASTAL PLAIN", "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "TAMAULIPAS-TEXAS SEMIARID PLAIN"), "NA_L2NAME"] <- "SOUTHEASTERN USA PLAINS AND EVERGLADES AND LOUISIANA PLAIN"
modDat_1_sf[modDat_1_sf$NA_L2NAME %in% c("EVERGLADES", "SOUTHEASTERN USA PLAINS", "TEXAS-LOUISIANA COASTAL PLAIN", "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS", "TAMAULIPAS-TEXAS SEMIARID PLAIN"), "NA_L2NAME"] <- "SOUTHEASTERN USA PLAINS AND EVERGLADES AND LOUISIANA PLAIN"
}
# Tabulate the number of observations in each Level 2 ecoregion and render
# as a styled HTML table. count() is the one-step equivalent of
# group_by() + summarize(length(...)).
modDat_1_s %>%
dplyr::count(NA_L2NAME, name = "Number_Of_Observations") %>%
rename("Level_2_Ecoregion" = NA_L2NAME) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Level_2_Ecoregion | Number_Of_Observations |
|---|---|
| ATLANTIC HIGHLANDS | 939 |
| CENTRAL USA PLAINS | 74 |
| COLD DESERTS | 5751 |
| MARINE WEST COAST FOREST | 2538 |
| MEDITERRANEAN CALIFORNIA | 766 |
| MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS | 771 |
| MIXED WOOD PLAINS | 942 |
| MIXED WOOD SHIELD | 656 |
| OZARK/OUACHITA-APPALACHIAN FORESTS | 1796 |
| SOUTH CENTRAL SEMIARID PRAIRIES | 190 |
| SOUTHEASTERN USA PLAINS | 2077 |
| TAMAULIPAS-TEXAS SEMIARID PLAIN | 5 |
| TEMPERATE PRAIRIES | 55 |
| TEXAS-LOUISIANA COASTAL PLAIN | 114 |
| UPPER GILA MOUNTAINS | 1730 |
| WARM DESERTS | 76 |
| WEST-CENTRAL SEMIARID PRAIRIES | 79 |
| WESTERN CORDILLERA | 19732 |
| WESTERN SIERRA MADRE PIEDMONT | 152 |
# Map of observations colored by Level 2 ecoregion (the spatial CV blocks)
map1 <- ggplot() +
geom_sf(data=cropped_states,fill='white') +
geom_sf(data=modDat_1_sf#[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
,
aes(fill=as.factor(NA_L2NAME)),linewidth=0.5,alpha=0.5) +
# NOTE(review): `alpha` is supplied twice to this geom_point() call
# (0.5 and .1) -- confirm which is intended and drop the other.
geom_point(data=modDat_1_s#[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
,
alpha=0.5,
aes(x = x, y = y, color=as.factor(NA_L2NAME)), alpha = .1) +
#scale_fill_okabeito() +
#scale_color_okabeito() +
# theme_default() +
theme(legend.position = 'none') +
labs(title = "Level 2 Ecoregions as spatial blocks")
# Convex hull of each ecoregion in tmean x prcp climate space
hull <- modDat_1_sf %>%
ungroup() %>%
group_by(NA_L2NAME) %>%
slice(chull(tmean, prcp))
plot1<-ggplot(data=modDat_1_sf,aes(x=tmean,y=prcp)) +
geom_polygon(data = hull, alpha = 0.25,aes(fill=NA_L2NAME) )+
geom_point(aes(group=NA_L2NAME,color=NA_L2NAME),alpha=0.25) +
theme_minimal() + xlab("Annual Average T_mean - long-term average") +
ylab("Annual Average Precip - long-term average") #+
#scale_color_okabeito() +
#scale_fill_okabeito()
# Per-ecoregion density of tmean and prcp
plot2<-ggplot(data=modDat_1_sf %>%
pivot_longer(cols=tmean:prcp),
aes(x=value,group=name)) +
# geom_polygon(data = hull, alpha = 0.25,aes(fill=fold) )+
geom_density(aes(group=NA_L2NAME,fill=NA_L2NAME),alpha=0.25) +
theme_minimal() +
facet_wrap(~name,scales='free')# +
#scale_color_okabeito() +
#scale_fill_okabeito()
library(patchwork)
# combine: map + climate-space hulls on top, densities below
(combo <- (map1+plot1)/plot2)
#
# ggplot(data = modDat_1_s) +
# geom_density(aes(ShrubCover_transformed, col = NA_L2NAME)) +
# xlim(c(0,100))
Get rid of transformed predictors and interactions that are correlated
# get first pass of names correlated variables
# NOTE(review): `X` (the model matrix of transformed predictors/interactions)
# is created in an earlier chunk that is not shown here.
X_df <- X %>%
as.data.frame() %>%
dplyr::select(-'(Intercept)')
corrNames_i <- X_df %>%
cor() %>%
caret::findCorrelation(cutoff = .7, verbose = FALSE, names = TRUE, exact = TRUE)
# remove those names that are untransformed main effects
# vector of columns to remove
badNames <- corrNames_i[!(corrNames_i %in% prednames)]
# iterate: drop flagged transformed/interaction terms and re-check until the
# only remaining correlated columns are untransformed main effects.
# all_of() replaces deprecated bare-external-vector (de)selection.
while (sum(!(corrNames_i %in% prednames))>0) {
corrNames_i <- X_df %>%
dplyr::select(-all_of(badNames)) %>%
cor() %>%
caret::findCorrelation(cutoff = .7, verbose = FALSE, names = TRUE, exact = TRUE)
# update the vector of names to remove
badNames <- unique(c(badNames, corrNames_i[!(corrNames_i %in% prednames)]))
}
## see if there are any correlated variables left (would be all main effects at this point)
# if there are, step through and remove the variable that each is most correlated with
if (length(corrNames_i)>1) {
for (i in 1:length(corrNames_i)) {
X_i <- X_df %>%
dplyr::select(-all_of(badNames))
if (corrNames_i[i] %in% names(X_i)) {
corMat_i <- cor(x = X_i[corrNames_i[i]], y = X_i %>% dplyr::select(-all_of(corrNames_i[i])))
badNames_i <- colnames(corMat_i)[abs(corMat_i)>=.7]
# if there are any predictors in the 'badNames_i', remove them from this list
if (length(badNames_i) > 0 & sum(c(badNames_i %in% prednames))>0) {
badNames_i <- badNames_i[!(badNames_i %in% prednames)]
}
badNames <- unique(c(badNames, badNames_i))
}
}
}
## update the X matrix to exclude these correlated variables
X <- X[,!(colnames(X) %in% badNames)]
if (run == TRUE) {
# set up custom folds
# get the ecoregions for training lambda
train_eco <- modDat_1_s$NA_L2NAME#[train]
# Fit model -----------------------------------------------
# specify leave-one-ecoregion-out cross-validation: each Level 2 ecoregion
# becomes one CV fold. (An earlier comment said "leave-one-year-out", but
# the folds are built from ecoregions, not years.)
my_folds <- as.numeric(as.factor(train_eco))
# set up parallel processing
library(doMC)
# this computer has 16 cores (parallel::detectCores())
registerDoMC(cores = 8)
# LASSO with a quasi-Beta family; `X`, `y`, `lambdas`, and quasibeta_family()
# come from earlier chunks / the sourced betaLASSO.R helper. Column 1 of X is
# the intercept, hence X[,2:ncol(X)].
fit <- cv.glmnet(
x = X[,2:ncol(X)],
y = y,
family = quasibeta_family(),
keep = FALSE,
alpha = 1, # 0 == ridge regression, 1 == lasso, 0.5 ~~ elastic net
lambda = lambdas,
relax = ifelse(response == "ShrubCover", yes = TRUE, no = FALSE),
#nlambda = 100,
type.measure="mse",
#penalty.factor = pen_facts,
foldid = my_folds,
#thresh = thresh,
standardize = FALSE, ## scales variables prior to the model sequence... coefficients are always returned on the original scale
parallel = TRUE
)
base::saveRDS(fit, paste0("../ModelFitting/models/betaLASSO/", response, "_globalLASSOmod_betaLogitLink_",ecoregion,"_removeAnomalies", removeAllAnoms,".rds"))
best_lambda <- fit$lambda.min
# save the lambda for the most regularized model w/ an MSE that is still 1SE w/in the best lambda model
lambda_1SE <- fit$lambda.1se
# save the lambda for the most regularized model w/ an MSE that is still .5SE w/in the best lambda model
lambda_halfSE <- best_lambda + ((lambda_1SE - best_lambda)/2)
## Now, we need to do stability selection to ensure the coefficients that are being chosen with each lambda are stable
## stability selection for best lambda model
# setup params
p <- ncol(X[,2:ncol(X)]) # of parameters
n <- length(y) # of observations
n_iter <- 100 # number of subsamples
sample_frac <- 0.75 # fraction of data to subsample
lambda_val <- best_lambda # fixed lambda value (could be chosen via CV)
# Track selection
selection_counts_bestL <- matrix(0, nrow = p, ncol = 1)
for (i in 1:n_iter) {
# Subsample rows
sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
X_sub <- X[sample_idx, ]
y_sub <- y[sample_idx]
# Fit Lasso model
fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub,
family = quasibeta_family(),
alpha = 1, lambda = lambda_val, standardize = FALSE)
# Get non-zero coefficients (excluding intercept)
select_bestL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
selection_counts_bestL[select_bestL] <- selection_counts_bestL[select_bestL] + 1
}
# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob_bestL <- selection_counts_bestL / n_iter
# coefficient names come from `fit_stab_i`, the fit from the *last* subsample
# iteration (names are identical across iterations)
selection_prob_bestL_df <- data.frame(
VariableNumber = paste0("X", 1:p),
VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
SelectionProb = as.vector(selection_prob_bestL)
)
# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
bestLambda_coef <- selection_prob_bestL_df[selection_prob_bestL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]
#//////
# stability selection for 1se lambda model
# (same subsampling procedure as the best-lambda block above, at lambda.1se)
lambda_val <- lambda_1SE # fixed lambda value (could be chosen via CV)
# Track selection
selection_counts_1seL <- matrix(0, nrow = p, ncol = 1)
for (i in 1:n_iter) {
# Subsample rows
sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
X_sub <- X[sample_idx, ]
y_sub <- y[sample_idx]
# Fit Lasso model
fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub,
family = quasibeta_family(),
alpha = 1, lambda = lambda_val, standardize = FALSE)
# Get non-zero coefficients (excluding intercept)
selected_1seL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
selection_counts_1seL[selected_1seL] <- selection_counts_1seL[selected_1seL] + 1
}
# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob_1seL <- selection_counts_1seL / n_iter
selection_prob_1seL_df <- data.frame(
VariableNumber = paste0("X", 1:p),
VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
SelectionProb = as.vector(selection_prob_1seL)
)
# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
seLambda_coef <- selection_prob_1seL_df[selection_prob_1seL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]
# stability selection for half se lambda model
# (same subsampling procedure again, at the half-SE lambda)
lambda_val <- lambda_halfSE # fixed lambda value (could be chosen via CV)
# Track selection
selection_counts_halfseL <- matrix(0, nrow = p, ncol = 1)
for (i in 1:n_iter) {
# Subsample rows
sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
X_sub <- X[sample_idx, ]
y_sub <- y[sample_idx]
# Fit Lasso model
fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub,
family = quasibeta_family(),
alpha = 1, lambda = lambda_val, standardize = FALSE)
# Get non-zero coefficients (excluding intercept)
selected_halfseL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
selection_counts_halfseL[selected_halfseL] <- selection_counts_halfseL[selected_halfseL] + 1
}
# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob_halfseL <- selection_counts_halfseL / n_iter
selection_prob_halfseL_df <- data.frame(
VariableNumber = paste0("X", 1:p),
VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
SelectionProb = as.vector(selection_prob_halfseL)
)
# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
halfseLambda_coef <- selection_prob_halfseL_df[selection_prob_halfseL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]
## If prompted to do so by the input arguments, remove any predictors that are for weather anomalies whose corresponding climate predictor is not included in the model
# (`prednames_weath` and `prednames_clim` are defined in an earlier chunk)
if (trimAnom == TRUE) {
# get unique predictors
# strip the I(...^2) wrappers from squared terms, then split interaction
# terms on ":" so each column holds a bare predictor name
# NOTE(review): this branch applies "\\^2\\)" twice, while the 1se/half-se
# branches use "_s\\^2\\)" then "\\^2\\)" -- net effect is the same after
# the earlier "_s" rename, but the three branches should be aligned.
bestLamb_all <- bestLambda_coef$VariableName %>% #[str_detect(bestLambda_coef$VariableName, pattern = "_anom_s")] %>%
str_remove(pattern = "I\\(") %>%
str_remove(pattern = "\\^2\\)") %>%
str_remove(pattern = "\\^2\\)") %>%
str_split(pattern = ":", simplify = TRUE)
if (ncol(bestLamb_all) == 1) {
bestLamb_all <- unique(bestLamb_all)
} else {
bestLamb_all <- c(bestLamb_all[bestLamb_all[,1] !="",1], bestLamb_all[bestLamb_all[,2] !="",2]) %>%
unique()
}
# get anomalies
bestLamb_anom <- bestLamb_all[bestLamb_all %in% prednames_weath]
# get climate
bestLamb_clim <- bestLamb_all[bestLamb_all %in% prednames_clim]
# which anomalies are present in the predictors, but their corresponding climate variable isn't
bestLamb_anomsWithMissingClim <- bestLamb_anom[!(str_remove(bestLamb_anom, "_anom") %in% bestLamb_clim)]
# remove anomalies (and all interaction terms w/ those anomalies) from the predictor list
if (length(bestLamb_anomsWithMissingClim) != 0) {
bestLambda_coef_NEW <- bestLambda_coef
for (i in 1:length(bestLamb_anomsWithMissingClim)) {
bestLambda_coef_NEW <- bestLambda_coef_NEW[!str_detect(bestLambda_coef_NEW$VariableName, pattern = bestLamb_anomsWithMissingClim[i]),]
}
bestLambda_coef <- bestLambda_coef_NEW
}
#//// 1 se lambda model
if (nrow(seLambda_coef) !=0) {
# get unique predictors
seLamb_all <- seLambda_coef$VariableName %>% #[str_detect(seLambda_coef$VariableName, pattern = "_anom_s")] %>%
str_remove(pattern = "I\\(") %>%
str_remove(pattern = "_s\\^2\\)") %>%
str_remove(pattern = "\\^2\\)") %>%
str_split(pattern = ":", simplify = TRUE)
if (ncol(seLamb_all) == 1) {
seLamb_all <- unique(seLamb_all)
} else {
seLamb_all <- c(seLamb_all[seLamb_all[,1] !="",1], seLamb_all[seLamb_all[,2] !="",2]) %>%
unique()
}
# get anomalies
seLamb_anom <- seLamb_all[seLamb_all %in% prednames_weath]
# get climate
seLamb_clim <- seLamb_all[seLamb_all %in% prednames_clim]
# which anomalies are present in the predictors, but their corresponding climate variable isn't
seLamb_anomsWithMissingClim <- seLamb_anom[!(str_remove(seLamb_anom, "_anom") %in% seLamb_clim)]
# remove anomalies (and all interaction terms w/ those anomalies) from the predictor list
if (length(seLamb_anomsWithMissingClim) != 0) {
seLambda_coef_NEW <- seLambda_coef
for (i in 1:length(seLamb_anomsWithMissingClim)) {
seLambda_coef_NEW <- seLambda_coef_NEW[!str_detect(seLambda_coef_NEW$VariableName, pattern = seLamb_anomsWithMissingClim[i]),]
}
seLambda_coef <- seLambda_coef_NEW
}
}
#//// half se lambda model (header comment previously said "1 se")
if (nrow(halfseLambda_coef) !=0) {
# get unique predictors
halfseLamball<- halfseLambda_coef$VariableName %>% #[str_detect(halfseLambda_coef$VariableName, pattern = "_anom_s")] %>%
str_remove(pattern = "I\\(") %>%
str_remove(pattern = "_s\\^2\\)") %>%
str_remove(pattern = "\\^2\\)") %>%
str_split(pattern = ":", simplify = TRUE)
if (ncol(halfseLamball) == 1) {
halfseLamball <- unique(halfseLamball)
} else {
halfseLamball <- c(halfseLamball[halfseLamball[,1] !="",1], halfseLamball[halfseLamball[,2] !="",2]) %>%
unique()
}
# get anomalies
halfseLambanom <- halfseLamball[halfseLamball %in% prednames_weath]
# get climate
halfseLambclim <- halfseLamball[halfseLamball %in% prednames_clim]
# which anomalies are present in the predictors, but their corresponding climate variable isn't
# NOTE(review): the other two branches strip "_anom" here; stripping
# "_anom_s" will not match names whose "_s" suffix was renamed away earlier,
# so anomalies may never match their climate partner and get over-removed --
# confirm and align with the other branches.
halfseLambanomsWithMissingClim <- halfseLambanom[!(str_remove(halfseLambanom, "_anom_s") %in% halfseLambclim)]
# remove anomalies (and all interaction terms w/ those anomalies) from the predictor list
##
if (length(halfseLambanomsWithMissingClim) != 0) {
halfseLambda_coef_NEW <- halfseLambda_coef
for (i in 1:length(halfseLambanomsWithMissingClim)) {
halfseLambda_coef_NEW <- halfseLambda_coef_NEW[!str_detect(halfseLambda_coef_NEW$VariableName, pattern = halfseLambanomsWithMissingClim[i]),]
}
halfseLambda_coef <- halfseLambda_coef_NEW
}
}
##
}
##
## fit w/ the identified coefficients from the 'best' lambda, but using the glm function
# build a formula from the stability-selected terms (intercept-only if none survived)
mat_glmnet_best <- bestLambda_coef$VariableName
if (length(mat_glmnet_best) == 0) {
f_glm_bestLambda <- as.formula(paste0(response, "~ 1"))
} else {
f_glm_bestLambda <- as.formula(paste0(response, " ~ ", paste0(mat_glmnet_best, collapse = " + ")))
}
## fit using betareg
# chained assignment: both names refer to the same quasi-Beta glm fit
fit_glm_bestLambda <- fit_glm_bestLambda_quasiBETA <- glm(formula = f_glm_bestLambda, data = modDat_1_s, family =quasibeta_family(link = "logit") )
#fit_glm_bestLambda_BETAREG <- betareg(formula = f_glm_bestLambda, data = modDat_1_s, link = "logit")
## fit using quasi-beta
# ## compare coefficients
# modCoeffsTest <- data.frame("coefficientName" = names(coefficients(fit_glm_bestLambda_BETAREG)),
# "coefficientValue" = coefficients(fit_glm_bestLambda_BETAREG),
# "confint_low" = confint(fit_glm_bestLambda_BETAREG)[,1],
# "confint_high" = confint(fit_glm_bestLambda_BETAREG)[,2],
# "type" = "betareg") %>%
# rbind(data.frame("coefficientName" = names(coefficients(fit_glm_bestLambda_quasiBETA)),
# "coefficientValue" = coefficients(fit_glm_bestLambda_quasiBETA),
# "confint_low" = coefficients(fit_glm_bestLambda_quasiBETA) - ( summary(fit_glm_bestLambda_quasiBETA)$coefficients[,2] * 1.96) ,
# "confint_high" = coefficients(fit_glm_bestLambda_quasiBETA) + ( summary(fit_glm_bestLambda_quasiBETA)$coefficients[,2] * 1.96),
# "type" = "quasiBeta")) %>%
# filter(coefficientName != "(phi)")
# ggplot(data = modCoeffsTest) +
# geom_point(aes(x = coefficientValue, y = coefficientName, col = type)) +
# geom_errorbar(aes(xmin = confint_low, xmax = confint_high, y = coefficientName, col = type)) +
# geom_vline(aes(xintercept = 0), lty = 3)
#
## fit w/ the identified coefficients from the '1se' lambda, but using the glm function
mat_glmnet_1se <- seLambda_coef$VariableName
if (length(mat_glmnet_1se) == 0) {
f_glm_1se <- as.formula(paste0(response, "~ 1"))
} else {
f_glm_1se <- as.formula(paste0(response, " ~ ", paste0(mat_glmnet_1se, collapse = " + ")))
}
fit_glm_se <- glm(formula = f_glm_1se, data = modDat_1_s, family =quasibeta_family(link = "logit") )
# glm(data = modDat_1_s, formula = f_glm_1se,
# family = stats::Gamma(link = "log"))
## fit w/ the identified coefficients from the '.5se' lambda, but using the glm function
mat_glmnet_halfse <- halfseLambda_coef$VariableName
if (length(mat_glmnet_halfse) == 0) {
f_glm_halfse <- as.formula(paste0(response, "~ 1"))
} else {
f_glm_halfse <- as.formula(paste0(response, " ~ ", paste0(mat_glmnet_halfse, collapse = " + ")))
}
fit_glm_halfse <- glm(formula = f_glm_halfse, data = modDat_1_s, family =quasibeta_family(link = "logit") )
## save models
if (trimAnom == TRUE) {
saveRDS(fit_glm_bestLambda, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_bestLambdaGLM.rds"))
saveRDS(fit_glm_halfse, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_halfSELambdaGLM.rds"))
saveRDS(fit_glm_se, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_oneSELambdaGLM.rds"))
} else {
saveRDS(fit_glm_bestLambda, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_bestLambdaGLM.rds"))
saveRDS(fit_glm_halfse, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_halfSELambdaGLM.rds"))
saveRDS(fit_glm_se, paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_oneSELambdaGLM.rds"))
}
## save the R environment after running the models
# NOTE(review): save() writes an RData-format file, so the ".rds" extension is
# misleading (readRDS() cannot open it); it is reloaded below with load().
save(f_glm_halfse, mat_glmnet_halfse, halfseLambda_coef,
f_glm_1se, mat_glmnet_1se, seLambda_coef,
f_glm_bestLambda, mat_glmnet_best, bestLambda_coef,
file = paste0("./models/interimModelFittingObjects_", response, "_", ecoregion, "_removeAnomalies", removeAllAnoms,"_betaReg.rds" ))
} else {
# run == FALSE: reload previously fitted objects instead of refitting
# read in LASSO object
fit <- readRDS(paste0("../ModelFitting/models/betaLASSO/", response, "_globalLASSOmod_betaLogitLink_",ecoregion,"_removeAnomalies", removeAllAnoms,".rds"))
# read in R objects having to do w/ model fitting
load(file = paste0("./models/interimModelFittingObjects_", response, "_", ecoregion, "_removeAnomalies", removeAllAnoms,"_betaReg.rds" ))
if (trimAnom == TRUE) {
fit_glm_bestLambda <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_bestLambdaGLM.rds"))
fit_glm_halfse <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_halfSELambdaGLM.rds"))
fit_glm_se <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_trimAnom_oneSELambdaGLM.rds"))
} else {
fit_glm_bestLambda <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_bestLambdaGLM.rds"))
fit_glm_halfse <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_halfSELambdaGLM.rds"))
fit_glm_se <- readRDS( paste0("./models/betaLASSO/",response,"_",ecoregion, "_noTLP_",removeTLP,"_removeAnomalies", removeAllAnoms,"_oneSELambdaGLM.rds"))
}
}
# assess model fit
# assess.glmnet(fit$fit.preval, #newx = X[,2:293],
# newy = y, family = stats::Gamma(link = "log"))
# save the minimum lambda (re-extracted here so both the run == TRUE branch
# and the reload branch end up with the same lambda objects)
best_lambda <- fit$lambda.min
# save the lambda for the most regularized model w/ an MSE that is still 1SE w/in the best lambda model
lambda_1SE <- fit$lambda.1se
# save the lambda for the most regularized model w/ an MSE that is still .5SE w/in the best lambda model
lambda_halfSE <- best_lambda + ((lambda_1SE - best_lambda)/2)
print(fit)
##
## Call: cv.glmnet(x = X[, 2:ncol(X)], y = y, lambda = lambdas, type.measure = "mse", foldid = my_folds, keep = FALSE, parallel = TRUE, relax = ifelse(response == "ShrubCover", yes = TRUE, no = FALSE), family = quasibeta_family(), alpha = 1, standardize = FALSE)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.00052 143 0.06319 0.003181 112
## 1se 0.05873 75 0.06626 0.002970 5
# CV error curve across the lambda sequence
plot(fit)
Then, we predict (on the training set) using both of these models (best lambda and 1se lambda)
## generate predictions from each fitted model
## NOTE(review): the original comment said "test data", but the predictions
## below use X, which appears to be the training design matrix -- confirm
# lasso model predictions with the optimal (best) lambda, plus the 1se and
# 1/2se lambda models; column 1 of X is skipped (presumably an ID or intercept
# column -- TODO confirm)
optimal_pred <- predict(fit_glm_bestLambda, newx=X[,2:ncol(X)], type = "response")
optimal_pred_1se <- predict(fit_glm_se, newx=X[,2:ncol(X)], type = "response")
optimal_pred_halfse <- predict(fit_glm_halfse, newx = X[,2:ncol(X)], type = "response")
# intercept-only null model for comparison, fit with the same quasibeta family
null_fit <- glm(#data = data.frame("y" = y, X[,paste0(prednames, "_s")]),
formula = y ~ 1, #data = modDat_1_s,
family = quasibeta_family(link = "logit")
#formula = y ~ 1, family = stats::Gamma(link = "log")
)
null_pred <- predict(null_fit, newdata = as.data.frame(X), type = "response"
)
# ## snap values above 100 and below 0 to be 100 or z
# #optimal_pred[optimal_pred>100] <- 100
# optimal_pred[optimal_pred<0] <- 0
# #optimal_pred_1se[optimal_pred_1se>100] <- 100
# optimal_pred_1se[optimal_pred_1se<0] <- 0
# #optimal_pred_halfse[optimal_pred_halfse>100] <- 100
# optimal_pred_halfse[optimal_pred_halfse<0] <- 0
#
# save the fitted models and the observed/predicted values in one list for
# downstream RMSE/bias calculations and plotting
fullModOut <- list(
"modelObject" = fit,
"nullModelObject" = null_fit,
"modelPredictions" = data.frame(#ecoRegion_holdout = rep(test_eco,length(y)),
obs=y,
pred_opt=optimal_pred,
pred_opt_se = optimal_pred_1se,
pred_opt_halfse = optimal_pred_halfse,
pred_null=null_pred#,
#pred_nopenalty=nopen_pred
))
# Quick visual sanity check: observed values and predictions from each model,
# all plotted against the first predictor column of X (second column overall)
ggplot() +
geom_point(aes(X[,2], fullModOut$modelPredictions$obs), col = "black", alpha = .1) +
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt), col = "red", alpha = .1) + ## predictions w/ the CV model
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_halfse), col = "orange", alpha = .1) + ## predictions w/ the CV model (.5se lambda)
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_se), col = "green", alpha = .1) + ## predictions w/ the CV model (1se lambda)
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_null), col = "blue", alpha = .1) +
labs(title = "A rough comparison of observed and model-predicted values",
subtitle = "black = observed values \n red = predictions from 'best lambda' model \n orange = predictions for '1/2se' lambda model \n green = predictions from '1se' lambda model \n blue = predictions from null model") +
xlab(colnames(X)[2])
#ylim(c(0,200))
The internal cross-validation process used to fit the global LASSO model
identified an optimal lambda value (regularization parameter) of
`r best_lambda`. The lambda value such that the cross-validation
error is within 1 standard error of the minimum ("1se
lambda") was `r fit$lambda.1se`. The following coefficients
were kept in each model:
# Extract the coefficient matrices from the three fitted LASSO models, combine
# them into one table, and reduce the best-lambda model's retained terms to
# their unique underlying predictor names.
# (cleanup: removed a duplicated comment line and a no-op self-assignment of
# prednames_fig that were in the original)
# the coefficient matrix from the 'best model' -- find and print those coefficients that aren't 0 in a table
coef_glm_bestLambda <- coef(fit_glm_bestLambda) %>%
  data.frame()
coef_glm_bestLambda$coefficientName <- rownames(coef_glm_bestLambda)
names(coef_glm_bestLambda)[1] <- "coefficientValue_bestLambda"
# coefficient matrix from the '1se' model
coef_glm_1se <- coef(fit_glm_se) %>%
  data.frame()
coef_glm_1se$coefficientName <- rownames(coef_glm_1se)
names(coef_glm_1se)[1] <- "coefficientValue_1seLambda"
# coefficient matrix from the 'half se' model
coef_glm_halfse <- coef(fit_glm_halfse) %>%
  data.frame()
coef_glm_halfse$coefficientName <- rownames(coef_glm_halfse)
names(coef_glm_halfse)[1] <- "coefficientValue_halfseLambda"
# combine the three tables; full joins keep terms that appear in any model,
# and terms dropped from a model show as NA in that model's column
coefs <- full_join(coef_glm_bestLambda, coef_glm_halfse) %>%
  full_join(coef_glm_1se) %>%
  dplyr::select(coefficientName, coefficientValue_bestLambda,
                coefficientValue_halfseLambda, coefficientValue_1seLambda)
# terms retained (non-NA coefficient) in the best-lambda model
globModTerms <- coefs[!is.na(coefs$coefficientValue_bestLambda), "coefficientName"]
## also, get the number of unique variables in each model
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)
# for best lambda model: split interaction terms on ":" and strip the
# "I(...^2)" wrapper so each term reduces to its underlying predictor name
prednames_fig <- paste(str_split(globModTerms, ":", simplify = TRUE))
prednames_fig <- str_replace(prednames_fig, "I\\(", "")
prednames_fig <- str_replace(prednames_fig, "\\^2\\)", "")
# the "> 0" string comparison drops both "" (padding produced by str_split's
# simplify = TRUE) and "(Intercept)", which sort before "0" in typical
# collation orders -- NOTE(review): this filter is locale-dependent
prednames_fig <- unique(prednames_fig[prednames_fig > 0])
prednames_fig_num <- length(prednames_fig)
# Reduce a vector of retained model terms (possibly interactions "a:b" or
# quadratic "I(x^2)" terms) to the unique underlying predictor names.
# The "> 0" string comparison drops both "" (padding from str_split) and
# "(Intercept)", which sort before "0" in typical collation orders.
# (refactor: the original repeated this 4-line pipeline verbatim in four
# branches; the if/else branches differed only in the predictor count)
termsToPrednames <- function(modTerms) {
  nm <- paste(str_split(modTerms, ":", simplify = TRUE))
  nm <- str_replace(nm, "I\\(", "")
  nm <- str_replace(nm, "\\^2\\)", "")
  unique(nm[nm > 0])
}
# for 1SE lambda model
globModTerms_1se <- coefs[!is.na(coefs$coefficientValue_1seLambda), "coefficientName"]
prednames_fig_1se <- termsToPrednames(globModTerms_1se)
# an intercept-only model (a single retained term) counts as 0 predictors
if (length(globModTerms_1se) == 1) {
  prednames_fig_1se_num <- 0
} else {
  prednames_fig_1se_num <- length(prednames_fig_1se)
}
# for 1/2SE lambda model
globModTerms_halfse <- coefs[!is.na(coefs$coefficientValue_halfseLambda), "coefficientName"]
prednames_fig_halfse <- termsToPrednames(globModTerms_halfse)
if (length(globModTerms_halfse) == 1) {
  prednames_fig_halfse_num <- 0
} else {
  prednames_fig_halfse_num <- length(prednames_fig_halfse)
}
# Render the combined coefficient table as a styled HTML table.
coefs %>%
  kable(col.names = c("Coefficient Name", "Value from best lambda model",
                      "Value from 1/2 se lambda", "Value from 1se lambda model")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Coefficient Name | Value from best lambda model | Value from 1/2 se lambda | Value from 1se lambda model |
|---|---|---|---|
| (Intercept) | -1.0770451 | -0.8403427 | -0.8403427 |
| tmean | 0.0685637 | NA | NA |
| prcp | 0.3944507 | 0.4771444 | 0.4771444 |
| prcp_dry | -0.0191202 | NA | NA |
| prcpTempCorr | -0.2352540 | NA | NA |
| isothermality | 0.0522266 | NA | NA |
| annWatDef | -0.6082585 | NA | NA |
| sand | 0.0994695 | NA | NA |
| coarse | 0.0901260 | NA | NA |
| carbon | 0.0673373 | 0.0666989 | 0.0666989 |
| AWHC | -0.0143945 | NA | NA |
| annWatDef_anom | 0.0183952 | NA | NA |
| I(tmean^2) | -0.1775559 | NA | NA |
| I(prcpTempCorr_anom^2) | 0.0094608 | NA | NA |
| I(isothermality_anom^2) | 0.0115196 | NA | NA |
| I(annWatDef_anom^2) | -0.0004953 | NA | NA |
| I(sand^2) | -0.0786910 | NA | NA |
| I(AWHC^2) | -0.1136888 | NA | NA |
| annWatDef_anom:isothermality_anom | 0.0206703 | NA | NA |
| prcpTempCorr:annWatDef_anom | 0.0073774 | NA | NA |
| annWatDef_anom:prcpTempCorr_anom | 0.0039794 | NA | NA |
| tmean:annWatDef_anom | 0.0463430 | NA | NA |
| isothermality:isothermality_anom | 0.0072614 | NA | NA |
| prcpTempCorr:isothermality | -0.0237655 | NA | NA |
| isothermality:prcpTempCorr_anom | 0.0530240 | NA | NA |
| tmean:isothermality | 0.1596632 | NA | NA |
| prcp_dry:isothermality_anom | 0.0063983 | NA | NA |
| isothermality_anom:prcpTempCorr_anom | -0.0187943 | NA | NA |
| tmean:isothermality_anom | -0.0202017 | NA | NA |
| prcp:prcpTempCorr_anom | 0.0363428 | NA | NA |
| prcp_dry:prcpTempCorr | 0.0553310 | NA | NA |
| tmean:prcp_dry | -0.0992651 | -0.1299574 | -0.1299574 |
| tmean:prcpTempCorr | -0.0641077 | NA | NA |
| tmean:prcpTempCorr_anom | -0.0006987 | NA | NA |
| carbon:AWHC | 0.0447461 | NA | NA |
| coarse:AWHC | -0.0778266 | NA | NA |
| sand:AWHC | -0.2585662 | NA | NA |
| coarse:carbon | 0.0028084 | NA | NA |
| sand:carbon | -0.0202561 | NA | NA |
| sand:coarse | -0.1021370 | NA | NA |
| carbon:coarse | NA | 0.0096471 | 0.0096471 |
# Summarize model fit (RMSE, bias) and model complexity (coefficient counts)
# for the three candidate models, and render the summary as a table.
# calculate RMSE of all models
RMSE_best <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt")], truth = "obs", estimate = "pred_opt")$.estimate
RMSE_halfse <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_halfse")], truth = "obs", estimate = "pred_opt_halfse")$.estimate
RMSE_1se <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_se")], truth = "obs", estimate = "pred_opt_se")$.estimate
# calculate bias of all models as mean(observed - predicted)
bias_best <- mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt)
bias_halfse <- mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_halfse)
bias_1se <- mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_se)
# Each column mixes numeric and character values, so every column is coerced
# to character. NOTE: data.frame() munges the quoted column headers to
# "Best.lambda.model", "X1.2.se.lambda.model", and "X1se.lambda.model";
# the second-best model selection code below depends on those munged names.
uniqueCoeffs <- data.frame("Best lambda model" = c(signif(RMSE_best,3), as.character(signif(bias_best, 3)),
as.integer(length(globModTerms)-1), as.integer(prednames_fig_num),
as.integer(sum(prednames_fig %in% c(prednames_clim))),
as.integer(sum(prednames_fig %in% c(prednames_weath))),
as.integer(sum(prednames_fig %in% c(prednames_soils)))
),
"1/2 se lambda model" = c(signif(RMSE_halfse,3), as.character(signif(bias_halfse, 3)),
length(globModTerms_halfse)-1, prednames_fig_halfse_num,
sum(prednames_fig_halfse %in% c(prednames_clim)),
sum(prednames_fig_halfse %in% c(prednames_weath)),
sum(prednames_fig_halfse %in% c(prednames_soils))),
"1se lambda model" = c(signif(RMSE_1se,3), as.character(signif(bias_1se, 3)),
length(globModTerms_1se)-1, prednames_fig_1se_num,
sum(prednames_fig_1se %in% c(prednames_clim)),
sum(prednames_fig_1se %in% c(prednames_weath)),
sum(prednames_fig_1se %in% c(prednames_soils))))
row.names(uniqueCoeffs) <- c("RMSE", "bias: mean(obs-pred.)", "Total number of coefficients", "Number of unique coefficients",
"Number of unique climate coefficients",
"Number of unique weather coefficients",
"Number of unique soils coefficients"
)
kable(uniqueCoeffs,
col.names = c("Best lambda model", "1/2 se lambda model", "1se lambda model"), row.names = TRUE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Best lambda model | 1/2 se lambda model | 1se lambda model | |
|---|---|---|---|
| RMSE | 0.236 | 0.242 | 0.242 |
| bias: mean(obs-pred.) | 2.98e-11 | 6.93e-13 | 6.93e-13 |
| Total number of coefficients | 39 | 4 | 4 |
| Number of unique coefficients | 13 | 5 | 5 |
| Number of unique climate coefficients | 6 | 3 | 3 |
| Number of unique weather coefficients | 3 | 0 | 0 |
| Number of unique soils coefficients | 4 | 2 | 2 |
As the alternative to the best lambda model, use the model (1se or 1/2se of best Lambda) that has the fewest number of unique predictors
# Choose the "second best" model to carry forward alongside the best-lambda
# model: either automatically (fewest unique predictors > 0, ties broken by
# fewest total coefficients, then by preferring the 1se model), or as
# explicitly requested via params$whichSecondBestMod ("1se" / "halfse").
if (whichSecondBestMod == "auto") {
  # smallest number of unique predictors among candidates (but more than 0);
  # row 4 of uniqueCoeffs is "Number of unique coefficients"
  uniqueCoeff_min <- min(as.numeric(uniqueCoeffs[4,2:3])[which(as.numeric(uniqueCoeffs[4,2:3]) > 0)])
  # column name(s) of the candidate model(s) achieving that minimum;
  # data.frame() munged the headers to "X1.2.se.lambda.model" / "X1se.lambda.model"
  alternativeModel <- names(uniqueCoeffs[4,2:3])[which(uniqueCoeffs[4,2:3] == uniqueCoeff_min)]
  if (is.finite(uniqueCoeff_min)) {
    if (length(alternativeModel) == 1) {
      if (alternativeModel == "X1.2.se.lambda.model") {
        mod_secondBest <- fit_glm_halfse
        name_secondBestMod <- "1/2 SE Model"
        prednames_secondBestMod <- prednames_fig_halfse
      } else if (alternativeModel == "X1se.lambda.model") {
        mod_secondBest <- fit_glm_se
        name_secondBestMod <- "1 SE Model"
        prednames_secondBestMod <- prednames_fig_1se
      }
    } else {
      # if both alternative models have the same number of unique coefficients,
      # choose the model with the fewest total coefficients (row 3)
      uniqueCoeff_min2 <- min(as.numeric(uniqueCoeffs[3,alternativeModel]))
      alternativeModel2 <- names(uniqueCoeffs[3,alternativeModel])[which(uniqueCoeffs[3,alternativeModel] == uniqueCoeff_min2)]
      # if both models still tie on total coefficients, choose the 1se model.
      # (bug fix: the original tested `length(alternativeModel2 == 2)`, i.e.
      # the length of a logical vector, which is always >= 1 and therefore
      # always truthy -- so the 1se model was chosen even when the 1/2se
      # model uniquely had fewer total coefficients)
      if (length(alternativeModel2) == 2) {
        alternativeModel2 <- "X1se.lambda.model"
      }
      if (alternativeModel2 == "X1.2.se.lambda.model") {
        mod_secondBest <- fit_glm_halfse
        name_secondBestMod <- "1/2 SE Model"
        prednames_secondBestMod <- prednames_fig_halfse
      } else if (alternativeModel2 == "X1se.lambda.model") {
        mod_secondBest <- fit_glm_se
        name_secondBestMod <- "1 SE Model"
        prednames_secondBestMod <- prednames_fig_1se
      }
    }
  } else {
    # no candidate model retained any predictors beyond the intercept
    mod_secondBest <- NULL
    name_secondBestMod <- "Intercept_Only"
    prednames_secondBestMod <- NULL
  }
} else if (whichSecondBestMod == "1se") {
  mod_secondBest <- fit_glm_se
  name_secondBestMod <- "1 SE Model"
  prednames_secondBestMod <- prednames_fig_1se
} else if (whichSecondBestMod == "halfse") {
  mod_secondBest <- fit_glm_halfse
  name_secondBestMod <- "1/2 SE Model"
  prednames_secondBestMod <- prednames_fig_halfse
}
# Generate predictions from one fitted model and append them to the supplied
# predictor matrix as a new "<response>_pred" column (the column name is built
# from the global `response` variable set from params).
predict_by_response <- function(mod, df) {
  pred_col <- paste0(response, "_pred")
  preds <- predict(mod, newx = df, type = "response")
  # snap any negative predictions up to zero
  preds[preds < 0] <- 0
  out <- cbind(df, preds)
  colnames(out)[ncol(out)] <- pred_col
  out
}
# Assemble the best-lambda model's predictions alongside the observed values,
# coordinates, and residuals.
pred_glm1 <- predict_by_response(fit_glm_bestLambda, X[,2:ncol(X)])
# add back in the observed (true) response values
pred_glm1 <- pred_glm1 %>%
  cbind(data.frame(response = modDat_1_s[,c(response)]))
names(pred_glm1)[names(pred_glm1) == "response"] <- response
# add back in lat/long data and year
pred_glm1 <- pred_glm1 %>%
  cbind(modDat_1_s[,c("x", "y", "Year")])
# residuals on the response (proportion) scale: observed - predicted
pred_glm1$resid <- pred_glm1[,response] - pred_glm1[,paste0(response, "_pred")]
# flag observations w/ extreme residuals (outside [-1, 1]).
# (bug fix: the original condition was `resid > 1 | resid < 1`, which is TRUE
# for every residual and so flagged every row; the matching block for the
# second-best model below correctly uses `< -1`)
pred_glm1$extremeResid <- NA
pred_glm1[pred_glm1$resid > 1 | pred_glm1$resid < -1,"extremeResid"] <- 1
# Assemble the second-best model's predictions alongside the observed values,
# coordinates, and residuals (skipped if no second-best model was selected).
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")
} else {
pred_glm1_1se <- predict_by_response(mod_secondBest, X[,2:ncol(X)])
# add back in true y values
pred_glm1_1se <- pred_glm1_1se %>%
cbind(data.frame(response = modDat_1_s[,c(response)]))
names(pred_glm1_1se)[names(pred_glm1_1se) == "response"] <- response
# add back in lat/long data
pred_glm1_1se <- pred_glm1_1se %>%
cbind(modDat_1_s[,c("x", "y", "Year")])
# residuals (observed - predicted) and a flag for residuals outside [-1, 1]
pred_glm1_1se$resid <- pred_glm1_1se[,response] - pred_glm1_1se[,paste0(response, "_pred")]
pred_glm1_1se$extremeResid <- NA
pred_glm1_1se[pred_glm1_1se$resid > 1 | pred_glm1_1se$resid < -1,"extremeResid"] <- 1
}
Observations across the temporal range of the dataset
# Build the reference raster grid for mapping and load ecoregion boundaries.
# rasterize
# get reference raster
# use the dayMet precipitation raster as the reference grid, cropped to CONUS
test_rast_dayMet <- rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>%
#terra::project(terra::crs("EPSG:4326")) %>%
terra::crop(ext(-2000000, 2600000, -1900000, 1500000))
## aggregate to a coarser grid (factor of 8) for plotting
## NOTE(review): the comment says 4x4 km^2 -- depends on the native dayMet
## cell size (presumably 1 km, giving 8x8 km cells) -- confirm
test_rast <- test_rast_dayMet %>%
terra::aggregate(fact = 8, fun = "mean")
# test_rast_Untranfsormed <- rast("../../../Data_raw/gridMet/pr_2020.nc") #%>% ## mean cell size of unaggregated grid size (16709380m^2)
# test_rast <- test_rast_Untranfsormed %>%
# terra::aggregate(fact = 2, fun = "mean") #%>%
# load EPA Level 2 ecoregion polygons (echoed st_read output below)
regions <- sf::st_read(dsn = "../../../Data_raw/Level2Ecoregions/", layer = "NA_CEC_Eco_Level2")
## Reading layer `NA_CEC_Eco_Level2' from data source
## `/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/cleanPED/PED_vegClimModels/Data_raw/Level2Ecoregions'
## using driver `ESRI Shapefile'
## Simple feature collection with 2261 features and 8 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -4334052 ymin: -3313739 xmax: 3324076 ymax: 4267265
## Projected CRS: Sphere_ARC_INFO_Lambert_Azimuthal_Equal_Area
# reproject the polygons to the reference raster's CRS and repair geometries
regions <- regions %>%
st_transform(crs = st_crs(crs(test_rast))) %>%
st_make_valid()
# (commented-out lookup table mapping Level 1 ecoregion names to the two
# broad model regions; retained for reference)
# ecoregionLU <- data.frame("NA_L1NAME" = sort(unique(regions$NA_L1NAME)),
# "newRegion" = c(NA, "Forest", "dryShrubGrass",
# "dryShrubGrass", "Forest", "dryShrubGrass",
# "dryShrubGrass", "Forest", "Forest",
# "dryShrubGrass", "Forest", "Forest",
# "Forest", "Forest", "dryShrubGrass",
# NA
# ))
# goodRegions <- regions %>%
# left_join(ecoregionLU)
# mapRegions <- goodRegions %>%
# filter(!is.na(newRegion)) %>%
# group_by(newRegion) %>%
# summarise(geometry = sf::st_union(geometry)) %>%
# ungroup() %>%
# st_simplify(dTolerance = 1000) %>%
# sf::st_crop(c("xmin" = -2000000, "xmax" = 2600000, "ymin" = -1900000, "ymax" = 1500000))
# make shapefile of cropped state boundaries in appropriate crs
# (cropped_states is presumably created earlier in the file from
# USA.state.boundaries -- confirm)
cropped_states_2 <- cropped_states %>%
st_transform(crs = crs(test_rast)) %>%
st_make_valid() %>%
sf::st_crop(c("xmin" = -2000000, "xmax" = 2600000, "ymin" = -1900000, "ymax" = 1500000))
# Build a multi-layer raster (observed, predicted, residual) on the reference
# grid from the best-lambda model's point-level predictions.
pred_glm1_sf <- pred_glm1 %>%
  st_as_sf(coords = c("x", "y"), crs = crs("EPSG:4326")) %>%
  st_transform(crs(test_rast)) %>%
  terra::vect()
# rasterize each variable of interest, taking the cell-wise mean of the
# points falling in each reference-grid cell
testRast2 <- lapply(c(response, paste0(response, "_pred"), "resid"), function(lyr) {
  lyrRast <- terra::rasterize(x = pred_glm1_sf, y = test_rast,
                              field = lyr, fun = mean, na.rm = TRUE)
  names(lyrRast) <- lyr
  lyrRast
})
# stack the three single-layer rasters into one multi-layer SpatRaster
pred_glm1_RAST <- c(testRast2[[1]], testRast2[[2]], testRast2[[3]])
# ## compare
# par(mfrow = c(2,1))
# terra::plot(pred_glm1_RAST$TotalTreeCover, maxcell = 30000000, main = "gridMet")
# terra::plot(pred_glm1_RAST_dayMet$TotalTreeCover, maxcell = 30000000, main = "dayMet")
# par(mfrow = c(1,1))
##
# make figure of observations: map of rasterized observed cover over state
# boundaries, paired with a histogram of the point-level observations
map_obs <- ggplot() +
geom_spatraster(data = pred_glm1_RAST, aes(fill = .data[[response]]), maxcell = 50000000) +
#geom_sf(data = pred_glm1_RAST, aes(col = TotalTreeCover)) +
# stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover), fun = mean, binwidth = .05) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
labs(title = paste0("Observations of ", response, " in the ",ecoregion, " ecoregion")) +
scale_fill_gradient2(low = "brown",
mid = "wheat" ,
high = "forestgreen" ,
midpoint = 0, na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_RAST)) +
# xlim(-104, -100) +
# ylim(30, 33)
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
# histogram of observations
hist_obs <- pred_glm1%>%
ggplot() +
geom_histogram(aes(.data[[response]]), fill = "lightgrey", col = "darkgrey")
library(ggpubr)
# stack the map over the histogram in one figure
ggarrange( map_obs, hist_obs, heights = c(3, 1), ncol = 1)
#
# # rasterize
# # get reference raster
# test_rast <- rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>%
# terra::aggregate(fact = 8, fun = "mean")
# ## add ecoregion boundaries (for our ecoregion level model)
# regions <- sf::st_read(dsn = "../../../Data_raw/Level2Ecoregions/", layer = "NA_CEC_Eco_Level2")
# regions <- regions %>%
# st_transform(crs = st_crs(test_rast)) %>%
# st_make_valid()
#
# ecoregionLU <- data.frame("NA_L1NAME" = sort(unique(regions$NA_L1NAME)),
# "newRegion" = c(NA, "Forest", "dryShrubGrass",
# "dryShrubGrass", "Forest", "dryShrubGrass",
# "dryShrubGrass", "Forest", "Forest",
# "dryShrubGrass", "Forest", "Forest",
# "Forest", "Forest", "dryShrubGrass",
# NA
# ))
# goodRegions <- regions %>%
# left_join(ecoregionLU)
# mapRegions <- goodRegions %>%
# filter(!is.na(newRegion)) %>%
# group_by(newRegion) %>%
# summarise(geometry = sf::st_union(geometry)) %>%
# ungroup() %>%
# st_simplify(dTolerance = 1000)
# #mapview(mapRegions)
# # rasterize data
# plotObs <- pred_glm1 %>%
# drop_na(paste(response)) %>%
# #slice_sample(n = 5e4) %>%
# terra::vect(geom = c("Long", "Lat")) %>%
# terra::set.crs(crs(test_rast)) %>%
# terra::rasterize(y = test_rast,
# field = response,
# fun = mean) #%>%
# #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
# #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
#
# # get the extent of this particular raster, and crop it accordingly
# tempExt <- crds(plotObs, na.rm = TRUE)
#
# plotObs_2 <- plotObs %>%
# crop(ext(min(tempExt[,1]), max(tempExt[,1]),
# min(tempExt[,2]), max(tempExt[,2]))
# )
# # make figures
# map_obs <- ggplot() +
# geom_spatraster(data = plotObs_2) +
# geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
# geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
# labs(title = paste0("Observations of ", response, " in the ",ecoregion, " ecoregion")) +
# scale_fill_gradient2(low = "brown",
# mid = "wheat" ,
# high = "darkgreen" ,
# midpoint = 0, na.value = "grey20") +
# xlim(st_bbox(plotObs_2)[c(1,3)]) +
# ylim(st_bbox(plotObs_2)[c(2,4)])
#
# hist_obs <- ggplot(pred_glm1) +
# geom_histogram(aes(.data[[response]]), fill = "lightgrey", col = "darkgrey")
#
# library(ggpubr)
# ggarrange(map_obs, hist_obs, heights = c(3,1), ncol = 1)
Predictions across the temporal range of the dataset
# Map of the best-lambda model's rasterized predictions, paired with a
# histogram of the point-level predicted values (both constrained to [0, 1]).
map_preds1 <- ggplot() +
geom_spatraster(data = pred_glm1_RAST, aes(fill = .data[[paste0(response,"_pred")]]), maxcell = 50000000) +
#geom_sf(data = highPred_points, col = "red") +
geom_sf(data=cropped_states_2 ,fill=NA ) +
labs(title = paste0("Predictions from the 'best lambda' fitted model of ", response, " in the ",ecoregion, " ecoregion"),
subtitle = "bestLambda model") +
# NOTE(review): midpoint = 1 sits at the upper limit of the [0, 1] scale, so
# the "high" color (red) is effectively unused -- confirm this is intended
scale_fill_gradient2(low = "wheat",
mid = "darkgreen",
high = "red" ,
midpoint = 1, na.value = "white",
limits = c(0,1)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
hist_preds1 <- ggplot(pred_glm1) +
geom_histogram(aes(.data[[paste0(response, "_pred")]]), fill = "lightgrey", col = "darkgrey")+
xlim(c(0,1))
ggarrange(map_preds1, hist_preds1, heights = c(3,1), ncol = 1)
# Map of the second-best model's rasterized predictions, paired with a
# histogram of its point-level predicted values (skipped if no second-best
# model was selected).
if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
} else {
  # rasterize data
  # make a multi-layer raster w/ layers for each variable of interest
  pred_glm1_1se_sf <- pred_glm1_1se %>%
    st_as_sf(coords = c("x", "y"), crs = crs("EPSG:4326")) %>%
    st_transform(crs(test_rast)) %>%
    terra::vect()
  # rasterize the observations, predictions, and residuals (cell-wise mean)
  testRast3 <- lapply(c(response, paste0(response, "_pred"), "resid"), FUN = function(x) {
    temp <- terra::rasterize(x = pred_glm1_1se_sf, y = test_rast,
                             field = x, fun = mean, na.rm = TRUE)
    names(temp) <- x
    return(temp)
  }
  )
  pred_glm1_1se_RAST <- c(testRast3[[1]], testRast3[[2]], testRast3[[3]])
  # make figures
  # (bug fix: this map previously plotted pred_glm1_RAST -- the best-lambda
  # model's raster -- instead of the second-best model's raster computed just
  # above, so the "second best model" figure showed the wrong predictions;
  # also added the missing space before "fitted" in the title string)
  map_preds2 <- ggplot() +
    geom_spatraster(data = pred_glm1_1se_RAST, aes(fill = .data[[paste0(response,"_pred")]]), maxcell = 50000000) +
    geom_sf(data=cropped_states_2 ,fill=NA ) +
    labs(title = paste0("Predictions from the ", name_secondBestMod, " fitted model of ", response, " in the ",ecoregion, " ecoregion"),
         subtitle = name_secondBestMod) +
    scale_fill_gradient2(low = "wheat",
                         mid = "darkgreen",
                         high = "red" ,
                         midpoint = 1, na.value = "white",
                         limits = c(0, 1)) +
    xlim(-2000000, 2500000) +
    ylim(-1800000, 900000)
  hist_preds2 <- ggplot(pred_glm1_1se) +
    geom_histogram(aes(.data[[paste0(response, "_pred")]]), fill = "lightgrey", col = "darkgrey")+
    xlim(c(0,1))
  ggarrange(map_preds2, hist_preds2, heights = c(3,1), ncol = 1)
}
# Map of the best-lambda model's rasterized residuals, with points marking
# extreme residuals, paired with a residual histogram.
# identify locations where residuals are > 0.5 or < -0.5
# (comment fix: a stale comment here said ">100 or < -100"; the thresholds
# below are +/- 0.5 on the proportion scale)
badResids_high <- pred_glm1 %>%
filter(resid > .5) %>%
terra::vect(geom = c("x", "y")) %>%
terra::set.crs(crs("EPSG:4326")) %>%
terra::project(crs(test_rast))
badResids_low <- pred_glm1 %>%
filter(resid < -.5) %>%
terra::vect(geom = c("x", "y")) %>%
terra::set.crs(crs("EPSG:4326")) %>%
terra::project(crs(test_rast))
# make figures
map <- ggplot() +
geom_spatraster(data = pred_glm1_RAST, aes(fill = resid), maxcell = 50000000) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = badResids_high, col = "blue") +
geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Resids. (obs. - pred.) from the ", ecoregion," ecoregion-wide model of ", response),
subtitle = "bestLambda model \n red points indicate locations that have residuals below -0.5 \n blue points indicate locations that have residuals above 0.5") +
scale_fill_gradient2(low = "red",
mid = "white" ,
high = "blue" ,
midpoint = 0, na.value = "grey30",
limits = c(-1,1)
) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
# histogram of residuals annotated with min, max, and mean
hist <- ggplot(pred_glm1) +
geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") +
geom_text(aes(x = min(resid)*.9, y = 1500, label = paste0("min = ", round(min(resid),2)))) +
geom_text(aes(x = max(resid)*.9, y = 1500, label = paste0("max = ", round(max(resid),2)))) +
geom_vline(aes(xintercept = mean(resid)))
ggarrange(map, hist, heights = c(3,1), ncol = 1)
# Map of the second-best model's rasterized residuals, with points marking
# extreme residuals, paired with a residual histogram (skipped if no
# second-best model was selected).
if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
} else {
  # identify locations where residuals are > 0.5 or < -0.5
  # (comment fix: a stale comment here said ">100 or < -100")
  badResids_high <- pred_glm1_1se %>%
    filter(resid > .5) %>%
    terra::vect(geom = c("x", "y")) %>%
    terra::set.crs(crs("EPSG:4326")) %>%
    terra::project(crs(test_rast))
  badResids_low <- pred_glm1_1se %>%
    filter(resid < -.5) %>%
    terra::vect(geom = c("x", "y")) %>%
    terra::set.crs(crs("EPSG:4326")) %>%
    terra::project(crs(test_rast))
  # make figures
  # (bug fix: the subtitle previously said "bestLambda model" even though this
  # figure shows the second-best model's residuals; it now uses
  # name_secondBestMod)
  map <- ggplot() +
    geom_spatraster(data = pred_glm1_1se_RAST, aes(fill = resid), maxcell = 50000000) +
    geom_sf(data=cropped_states_2 ,fill=NA ) +
    geom_sf(data = badResids_high, col = "blue") +
    geom_sf(data = badResids_low, col = "red") +
    labs(title = paste0("Resids. (obs. - pred.) from the ", ecoregion," ecoregion-wide model of ", response),
         subtitle = paste0(name_secondBestMod, " \n red points indicate locations that have residuals below -0.5 \n blue points indicate locations that have residuals above 0.5")) +
    scale_fill_gradient2(low = "red",
                         mid = "white" ,
                         high = "blue" ,
                         midpoint = 0, na.value = "grey30",
                         limits = c(-1,1)
    ) +
    xlim(-2000000, 2500000) +
    ylim(-1800000, 900000)
  # histogram of residuals annotated with min, max, and mean
  hist <- ggplot(pred_glm1_1se) +
    geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") +
    geom_text(aes(x = min(resid)*.9, y = 1500, label = paste0("min = ", round(min(resid),2)))) +
    geom_text(aes(x = max(resid)*.9, y = 1500, label = paste0("max = ", round(max(resid),2)))) +
    geom_vline(aes(xintercept = mean(resid)))
  ggarrange(map, hist, heights = c(3,1), ncol = 1)
}
# Plot residuals against Year, latitude, and longitude for the best-lambda
# model (and, when one exists, the second-best model side by side) to check
# for temporal or spatial trends in the residuals.
# (fix: corrected the user-facing typo "lamba" -> "lambda" in the plot titles)
if ( name_secondBestMod == "Intercept_Only") {
  print("The next best lambda model only contains one predictor (an intercept)")
  # plot residuals against Year (jittered x to reduce overplotting)
  yearResidMod_bestLambda <- ggplot(pred_glm1) +
    geom_point(aes(x = jitter(Year), y = resid), alpha = .1) +
    geom_smooth(aes(x = Year, y = resid)) +
    xlab("Year") +
    ylab("Residuals") +
    ggtitle("from best lambda model")
  # plot residuals against Lat
  latResidMod_bestLambda <- ggplot(pred_glm1) +
    geom_point(aes(x = y, y = resid), alpha = .1) +
    geom_smooth(aes(x = y, y = resid)) +
    xlab("Latitude") +
    ylab("Residuals") +
    ggtitle("from best lambda model")
  # plot residuals against Long
  longResidMod_bestLambda <- ggplot(pred_glm1) +
    geom_point(aes(x = x, y = resid), alpha = .1) +
    geom_smooth(aes(x = x, y = resid)) +
    xlab("Longitude") +
    ylab("Residuals") +
    ggtitle("from best lambda model")
  library(patchwork)
  (yearResidMod_bestLambda ) /
    ( latResidMod_bestLambda ) /
    ( longResidMod_bestLambda )
} else {
  # plot residuals against Year for both models
  yearResidMod_bestLambda <- ggplot(pred_glm1) +
    geom_point(aes(x = jitter(Year), y = resid), alpha = .1) +
    geom_smooth(aes(x = Year, y = resid)) +
    xlab("Year") +
    ylab("Residuals") +
    ggtitle("from best lambda model")
  yearResidMod_1seLambda <- ggplot(pred_glm1_1se) +
    geom_point(aes(x = jitter(Year), y = resid), alpha = .1) +
    geom_smooth(aes(x = Year, y = resid)) +
    xlab("Year") +
    ylab("Residuals") +
    ggtitle(paste0("from ", name_secondBestMod))
  # plot residuals against Lat for both models
  latResidMod_bestLambda <- ggplot(pred_glm1) +
    geom_point(aes(x = y, y = resid), alpha = .1) +
    geom_smooth(aes(x = y, y = resid)) +
    xlab("Latitude") +
    ylab("Residuals") +
    ggtitle("from best lambda model")
  latResidMod_1seLambda <- ggplot(pred_glm1_1se) +
    geom_point(aes(x = y, y = resid), alpha = .1) +
    geom_smooth(aes(x = y, y = resid)) +
    xlab("Latitude") +
    ylab("Residuals") +
    ggtitle(paste0("from ", name_secondBestMod))
  # plot residuals against Long for both models
  longResidMod_bestLambda <- ggplot(pred_glm1) +
    geom_point(aes(x = x, y = resid), alpha = .1) +
    geom_smooth(aes(x = x, y = resid)) +
    xlab("Longitude") +
    ylab("Residuals") +
    ggtitle("from best lambda model")
  longResidMod_1seLambda <- ggplot(pred_glm1_1se) +
    geom_point(aes(x = x, y = resid), alpha = .1) +
    geom_smooth(aes(x = x, y = resid)) +
    xlab("Longitude") +
    ylab("Residuals") +
    ggtitle(paste0("from ", name_secondBestMod))
  library(patchwork)
  # arrange: best-lambda plots in the left column, second-best on the right
  (yearResidMod_bestLambda + yearResidMod_1seLambda) /
    ( latResidMod_bestLambda + latResidMod_1seLambda) /
    ( longResidMod_bestLambda + longResidMod_1seLambda)
}
Binning predictor variables into “Quantiles” and looking at the mean predicted probability for each percentile.
# Bin predictor variables into percentiles for the decile/quantile plots
# (predvars2deciles is sourced from the project's function files).
# get deciles for best lambda model
if (length(prednames_fig) == 0) {
print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles <- predvars2deciles(pred_glm1,
response_vars = response_vars,
pred_vars = prednames_fig,
cut_points = seq(0, 1, 0.01))
}
# get deciles for the second-best (1se or 1/2se) lambda model
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")} else {
pred_glm1_deciles_1se <- predvars2deciles(pred_glm1_1se,
response_vars = response_vars,
pred_vars = prednames_secondBestMod,
cut_points = seq(0, 1, 0.01))
}
Below are quantile plots for the best lambda model (note that the predictor variables are scaled)
# Decile dotplot for the best-lambda model (observed vs. predicted by
# predictor percentile); figure saving is currently disabled.
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
# publication quality version
g3 <- decile_dotplot_pq(df = pred_glm1_deciles, response = c(response, paste0(response, "_pred")), IQR = TRUE,
CI = FALSE
) + ggtitle("Decile Plot")
g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles, response = c(response, paste0(response, "_pred")), dfRaw = pred_glm1, add_smooth = TRUE, deciles = FALSE)
if(save_figs) {
# png(paste0("../../../Figures/CoverDatFigures/ figures/quantile_plots/quantile_plot_", response, "_",ecoregion,".png"),
# units = "in", res = 600, width = 5.5, height = 3.5 )
# print(g4)
# dev.off()
}
g4
}
Below are percentile plots from the second best lambda model (the 1se or 1/2se lambda model selected above)
# Decile dotplot for the second-best model (skipped if no second-best model
# was selected); figure saving is currently disabled.
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")
} else {
# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles_1se, response = c(response, paste0(response, "_pred")), IQR = TRUE) + ggtitle("Decile Plot")
g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles_1se, response = c(response, paste0(response, "_pred")), dfRaw = pred_glm1_1se, add_smooth = TRUE, deciles = FALSE)
if(save_figs) {
# png(paste0("figures/quantile_plots/quantile_plot_", response, "_",ecoregion,".png"),
# units = "in", res = 600, width = 5.5, height = 3.5 )
# print(g4)
# dev.off()
}
g4
}
Filtered ‘Quantile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower 20th percentiles of each predictor variable.
# Filtered decile plot for the best-lambda model: restrict to data in the
# upper and lower 20th percentiles of each predictor variable.
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1,
response_vars = c(response, paste0(response, "_pred")),
pred_vars = prednames_fig,
filter_var = TRUE,
filter_vars = prednames_fig,
cut_points = seq(0, 1, 0.01))
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt, response = response,
xvars = prednames_fig)
if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
# units = "in", res = 600, width = 5.5, height = 6 )
# g
# dev.off()
}
g
}
## Processed 8732 groups out of 33494. 26% done. Time elapsed: 3s. ETA: 8s.Processed 12913 groups out of 33494. 39% done. Time elapsed: 4s. ETA: 6s.Processed 17105 groups out of 33494. 51% done. Time elapsed: 5s. ETA: 4s.Processed 21300 groups out of 33494. 64% done. Time elapsed: 6s. ETA: 3s.Processed 25423 groups out of 33494. 76% done. Time elapsed: 7s. ETA: 2s.Processed 29482 groups out of 33494. 88% done. Time elapsed: 8s. ETA: 1s.Processed 33494 groups out of 33494. 100% done. Time elapsed: 8s. ETA: 0s.
Filtered quantile figure with middle 2 deciles also shown
# Filtered decile plot for the best model, with the middle two deciles also
# shown (add_mid = TRUE). Skipped when the model is intercept-only.
# FIX: this block operates on the best-model objects (pred_glm1,
# prednames_fig), so the guard must check prednames_fig (as in the earlier
# filtered-plot chunk), not the second-best model's name — the previous
# guard was a copy-paste inconsistency.
if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles_filt_mid <- predvars2deciles(
    pred_glm1,
    response_vars = c(response, paste0(response, "_pred")),
    pred_vars = prednames_fig,
    filter_vars = prednames_fig,
    filter_var = TRUE,
    add_mid = TRUE,
    cut_points = seq(0, 1, 0.01)
  )
  g <- decile_dotplot_filtered_pq(
    df = pred_glm1_deciles_filt_mid,
    response = response,
    xvars = prednames_fig
  )
  if (save_figs) {
    # jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", ".jpeg"),
    #      units = "in", res = 600, width = 5.5, height = 6)
    # g
    # dev.off()
  }
  g
}
## Processed 11769 groups out of 50328. 23% done. Time elapsed: 3s. ETA: 9s.Processed 16265 groups out of 50328. 32% done. Time elapsed: 4s. ETA: 8s.Processed 20769 groups out of 50328. 41% done. Time elapsed: 5s. ETA: 7s.Processed 25234 groups out of 50328. 50% done. Time elapsed: 6s. ETA: 5s.Processed 29806 groups out of 50328. 59% done. Time elapsed: 7s. ETA: 4s.Processed 34349 groups out of 50328. 68% done. Time elapsed: 8s. ETA: 3s.Processed 38913 groups out of 50328. 77% done. Time elapsed: 9s. ETA: 2s.Processed 43360 groups out of 50328. 86% done. Time elapsed: 10s. ETA: 1s.Processed 47793 groups out of 50328. 95% done. Time elapsed: 11s. ETA: 0s.Processed 50328 groups out of 50328. 100% done. Time elapsed: 11s. ETA: 0s.
Filtered ‘Quantile’ plots of data for the second best lambda model. These plots show each vegetation variable, but only based on data that falls into the upper and lower 20th percentiles of each predictor variable.
# Filtered decile plot for the second-best lambda model.
# Skipped when that model is intercept-only (no predictors to filter by).
# FIX: the deciles are computed over prednames_secondBestMod, so the plot
# must use the same predictor set — the previous xvars = prednames_fig
# (the best model's predictors) was a mismatch that could drop or mislabel
# panels whenever the two models select different predictors.
if (length(prednames_secondBestMod) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles_filt <- predvars2deciles(
    pred_glm1_1se,
    response_vars = c(response, paste0(response, "_pred")),
    pred_vars = prednames_secondBestMod,
    filter_var = TRUE,
    filter_vars = prednames_secondBestMod,
    cut_points = seq(0, 1, 0.01)
  )
  g <- decile_dotplot_filtered_pq(
    df = pred_glm1_deciles_filt,
    response = response,
    xvars = prednames_secondBestMod
  )
  if (save_figs) {
    # jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", ".jpeg"),
    #      units = "in", res = 600, width = 5.5, height = 6)
    # g
    # dev.off()
  }
  g
}